In [1]:
# Old libraries that we know and love.
import numpy as np
import matplotlib.pylab as py
import pandas as pa
%matplotlib inline
# Our new libraries.
from sklearn import cross_validation, linear_model, feature_selection, metrics
import mayavi.mlab as mlab
In [2]:
# Read in the data using
Xy = pa.read_csv('Advertising.csv')
In [3]:
# Normalize data
# We do this to make plotting and processing easier. Many Sklearn functions do this
# for you behind the scenes, but we do it explicitly.
# Note, that this is a cousing of the physics idea of nondimensionalization. Think
# about the case where TV was measured in millions, while Radio was measured in
# thousands. One could imagine TV totally washing out the effect of Radio.
# In effect, after normalization, each predictor now stands on an "even footing".
#
# Is this always a good idea?
Xy = (Xy-Xy.min())/(Xy.max()-Xy.min())
In [4]:
# Since we will be plotting many times, we define a function.
def myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot,size=10,scale_factor=0.05):
mlab.clf()
mlab.points3d(np.array(X_train)[:,0],
np.array(X_train)[:,1],
np.array(y_train)[:,0],
color=(1,0,0), scale_factor=scale_factor)
mlab.points3d(np.array(X_test)[:,0],
np.array(X_test)[:,1],
np.array(y_test)[:,0],
color=(0,1,0), scale_factor=scale_factor)
mlab.mesh(xPlot,yPlot,zPlot,color=(0,0,1))
mlab.axes()
mlab.show()
In [5]:
# Now we try non-linear fittng. See notes for details.
# Note that we add a new column which is a *non-linear* function
# of the original data!
XyNonlinear = Xy.copy()
XyNonlinear['TV*Radio'] = Xy['TV']*Xy['Radio']
# Select out our predictor columns and our response columns
X = XyNonlinear.ix[:,['TV','Radio','TV*Radio']]
y = XyNonlinear.ix[:,['Sales']]
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,
y,
test_size=0.8,
random_state=123)
In [6]:
# Run the solver
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)
Out[6]:
In [7]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
zPlot = reg.predict(np.transpose(np.array([xPlot.flatten(),
yPlot.flatten(),
(xPlot*yPlot).flatten()])))
zPlot = zPlot.reshape([size,size])
In [8]:
myPlot(reg,X_train,y_train,X_test,y_test,xPlot,yPlot,zPlot)
In [9]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,reg.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,reg.predict(X_test))
In [10]:
# What about adding many non-linear combinations! See notes for details.
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])
for i in range(degree):
for j in range(degree):
XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
y,
test_size=0.8,
random_state=123)
In [11]:
# Run the solver
regOver = linear_model.LinearRegression(fit_intercept=True)
regOver.fit(X_train,y_train)
Out[11]:
In [12]:
print regOver.intercept_
print regOver.coef_
In [13]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
tmp = []
for i in range(degree):
for j in range(degree):
tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )
zPlot = regOver.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])
In [14]:
# Plot the data
# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
test_size=0.8,
random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)
In [15]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,regOver.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,regOver.predict(X_test))
In [16]:
# Fortunately, there is a *lot* that one can do to help. It is possible to have
# many predictors but still get good answers. See notes for details...
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])
names = []
for i in range(degree):
for j in range(degree):
XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
names.append('TV**%d*Radio**%d'%(i,j))
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
y,
test_size=0.8,
random_state=123)
In [18]:
# We can try None and 3 to see what we get.
selector = feature_selection.RFE(regOver,n_features_to_select=3)
selector.fit(X_train,y_train)
Out[18]:
In [19]:
# Print out the predictors we use. These are the predictors selection by the RFE algorithm
# as the most important.
for i in range(len(names)):
print names[i],
print selector.get_support()[i]
In [20]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
tmp = []
for i in range(degree):
for j in range(degree):
tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )
zPlot = selector.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])
In [21]:
# Plot the data
# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
test_size=0.8,
random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)
In [100]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,selector.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,selector.predict(X_test))
In [22]:
# Lasso regression is another method for doing feature selection.
# It is, by far, by favorite it is a close cousin of my personal
# research topic. See notes for details...
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])
names = []
for i in range(degree):
for j in range(degree):
XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
names.append('TV**%d*Radio**%d'%(i,j))
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
y,
test_size=0.8,
random_state=123)
In [23]:
# Run the solver
# alpha=0.1 is good
regRidge = linear_model.Ridge(alpha=0.1,fit_intercept=True,normalize=True)
regRidge.fit(X_train,y_train)
Out[23]:
In [24]:
# Print out the predictors we use. These betas with non-zero weights are those
# selected by the Lasso algorithm as being the most important. What do you notice?
print regRidge.intercept_
for i in range(len(regRidge.coef_[0])):
print names[i],regRidge.coef_[0][i]
py.plot(regRidge.coef_[0])
Out[24]:
In [25]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
tmp = []
for i in range(degree):
for j in range(degree):
tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )
zPlot = regRidge.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])
In [26]:
# Plot the data
# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
test_size=0.8,
random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)
In [106]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,regRidge.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,regRidge.predict(X_test))
In [28]:
# Lasso regression is another method for doing feature selection.
# It is, by far, by favorite it is a close cousin of my personal
# research topic. See notes for details...
degree=5
XCrazy = np.zeros([Xy.shape[0],degree**2])
names = []
for i in range(degree):
for j in range(degree):
XCrazy[:,i*degree + j] = (Xy['TV']**i)*(Xy['Radio']**j)
names.append('TV**%d*Radio**%d'%(i,j))
# Select subsets for training and testing
X_train,X_test,y_train,y_test = cross_validation.train_test_split(XCrazy,
y,
test_size=0.8,
random_state=123)
In [29]:
# Run the solver
regLasso = linear_model.Lasso(alpha=0.002,fit_intercept=True,normalize=True)
regLasso.fit(X_train,y_train)
Out[29]:
In [30]:
# Print out the predictors we use. These betas with non-zero weights are those
# selected by the Lasso algorithm as being the most important. What do you notice?
print regLasso.intercept_
for i in range(len(regLasso.coef_)):
print names[i],regLasso.coef_[i]
py.plot(regLasso.coef_)
Out[30]:
In [31]:
# Create data for plotting
size = 10
xPlot,yPlot = np.meshgrid(np.linspace(0,1,size),
np.linspace(0,1,size))
tmp = []
for i in range(degree):
for j in range(degree):
tmp.append( ( (xPlot**i)*(yPlot**j) ).flatten() )
zPlot = regLasso.predict(np.transpose(np.array(tmp)))
zPlot = zPlot.reshape([size,size])
In [32]:
# Plot the data
# Select subsets for training and testing
X_train_plot,X_test_plot = cross_validation.train_test_split(Xy.ix[:,['TV','Radio']],
test_size=0.8,
random_state=123)
myPlot(reg,X_train_plot,y_train,X_test_plot,y_test,xPlot,yPlot,zPlot)
In [112]:
# Use the metrics package to print our errors
print 'training error'
print metrics.mean_squared_error(y_train,regLasso.predict(X_train))
print 'testing error'
print metrics.mean_squared_error(y_test,regLasso.predict(X_test))
In [27]:
2**20
Out[27]:
In [ ]: